Species: Explore

Get Species Observations

1 and 2.

The species I will be using for this lab is the American Flamingo, Phoenicopterus ruber. Below is an image of an American Flamingo taken by my dad.

# output paths for the cleaned observations (tabular + spatial)
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")

# get species occurrence data from GBIF with coordinates
res <- spocc::occ(
    query = 'Phoenicopterus ruber', 
    from = 'gbif', 
    has_coords = TRUE,  # TRUE, not shorthand T (T is reassignable)
    limit = 10000
)
  
df <- res$gbif$data[[1]]
  
# drop records outside the expected range (e.g. captive birds at high
# latitudes/longitudes) and keep only the columns needed downstream
clean_df <- df %>% 
  filter(latitude < 40, 
         longitude < 100) %>% 
  select(c("name", "longitude", "latitude", "prov", "key", "lifeStage", 
           "continent", "stateProvince", "year", "species"))

nrow(clean_df) # number of rows
## [1] 9995
readr::write_csv(clean_df, obs_csv)

# convert to sf points in WGS84; keep only provider + key to save space
obs <- clean_df %>% 
  sf::st_as_sf(
    coords = c("longitude", "latitude"),
    crs = st_crs(4326)) %>% 
  select(prov, key) # save space (joinable from obs_csv)

sf::write_sf(obs, obs_geo, delete_dsn = TRUE)

# round-trip check: re-read the GeoJSON and confirm the row count
obs <- sf::read_sf(obs_geo)
nrow(obs)
## [1] 9995

3.

The mapped distribution of the points is shown below.

# show points on map
mapview::mapview(obs, map.types = "OpenTopoMap")

4 and 5. Question 1 and 2

There are 85,299 observations of American Flamingoes in the GBIF database. After cleaning the 10,000 observations to remove 5 in zoos in Northern Europe, my map displayed 9,995 observations of flamingoes. I considered removing duplicates, but flamingoes usually travel in large flocks, so it is to be expected that there would be multiple at very similar coordinates.

Get Environmental Data

Presence

dir_env <- file.path(dir_data, "env")

# set a default data directory where sdmpredictors caches downloads
options(sdmpredictors_datadir = dir_env)

# choosing terrestrial (not marine) datasets
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# show table of datasets
env_datasets %>% 
  select(dataset_code, description, citation) %>% 
  DT::datatable()

# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")

# get layers available from the chosen datasets
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)

6. Question

When deciding on the environmental layers to use, I looked into what factors affected the species distribution of flamingos. The most common factors are temperature, depth of water and access to food sources. Because of this, I chose the annual mean temperature, mean daily temperature, temperature seasonality, annual precipitation, terrain and topographic wetness layers.

# choose layers after some inspection and perhaps consulting literature:
# annual mean temperature, mean diurnal range, temperature seasonality,
# annual precipitation, terrain ruggedness index, topographic wetness
env_layers_vec <- c("WC_bio1", "WC_bio2", "WC_bio4", "WC_bio12", "ER_tri", "ER_topoWet")

# get layers (namespaced for consistency with other sdmpredictors calls)
env_stack <- sdmpredictors::load_layers(env_layers_vec)

# plot the layers in a two-column grid
plot(env_stack, nc=2)

7. Plot of environmental rasters

# build file paths from dir_data, consistent with the rest of the lab
obs_hull_geo  <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")

# make convex hull around points of observation
obs_hull <- sf::st_convex_hull(st_union(obs))
  
# save obs hull (path variable instead of a hard-coded string)
write_sf(obs_hull, obs_hull_geo)
obs_hull <- read_sf(obs_hull_geo)

# show points on map
mapview(list(obs, obs_hull))

# clip the raster stack to the hull: mask cells outside it, then crop extent
obs_hull_sp <- sf::as_Spatial(obs_hull)
env_stack <- raster::mask(env_stack, obs_hull_sp) %>% 
  raster::crop(extent(obs_hull_sp))

writeRaster(env_stack, env_stack_grd, overwrite = TRUE)
env_stack <- stack(env_stack_grd)

# show map
# mapview(obs) + 
#   mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)

Pseudo-Absence

8. Map of pseudo-absence points

absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

# get raster count of observations per cell
r_obs <- rasterize(
  sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
  
# candidate absence cells: any cell with environmental data (> -Inf)
# that does NOT contain a presence point (inverse mask)
r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse = TRUE)

# sample as many pseudo-absence points as there are presence points
absence <- dismo::randomPoints(r_mask, nrow(obs)) %>% 
  as_tibble() %>% 
  st_as_sf(coords = c("x", "y"), crs = 4326)

write_sf(absence, absence_geo, delete_dsn = TRUE)

absence <- read_sf(absence_geo)

# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") + 
  mapview(absence, col.regions = "gray")
# combine presence and absence into a single set of labeled points:
# present = 1 for observations, present = 0 for pseudo-absences
obs_labeled <- obs %>% 
  mutate(present = 1) %>% 
  select(present, key)

absence_labeled <- absence %>% 
  mutate(
    present = 0,
    key     = NA)

# stack the two sets and prepend a sequential row ID
pts <- rbind(obs_labeled, absence_labeled) %>% 
  mutate(ID = row_number()) %>% 
  relocate(ID)

write_sf(pts, pts_geo, delete_dsn = TRUE)

# extract raster values for points
# (raster::extract with df=TRUE returns a data.frame whose ID column
# matches the row order of pts)
pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>% 
  tibble() %>% 
  # join present and geometry columns to raster value results for points
  # (pts is an sf object, so the join carries its geometry column along)
  left_join(
    pts %>% 
      select(ID, present),
    by = "ID") %>% 
  relocate(present, .after = ID) %>% 
  # extract lon, lat as single columns
  mutate(
    #present = factor(present),
    lon = st_coordinates(geometry)[,1],
    lat = st_coordinates(geometry)[,2]) %>% 
  select(-geometry)
write_csv(pts_env, pts_env_csv)

# re-read from CSV so downstream code uses the persisted copy
pts_env <- read_csv(pts_env_csv)

pts_env %>% 
  # show first 10 presence, last 10 absence
  slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>% 
  DT::datatable(
    rownames = F,
    options = list(
      dom = "t",
      pageLength = 20))

9. Term Plots

# density of each environmental variable, split by presence (1) vs absence (0)
plot_data <- pts_env %>% 
  select(-ID) %>% 
  mutate(
    present = factor(present)) %>% 
  pivot_longer(-present)

ggplot(plot_data) +
  geom_density(aes(x = value, fill = present)) + 
  scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw() + 
  facet_wrap(~name, scales = "free") +
  # tuck the legend into the bottom-right corner of the panel area
  theme(
    legend.position = c(1, 0),
    legend.justification = c(1, 0))

Species: Regress

10. Plot of GGPairs

# pairwise scatter/correlation matrix of all predictors, colored by
# presence (1) vs absence (0); ID is dropped since it is not a predictor
pts_env %>% 
  select(-ID) %>% 
  GGally::ggpairs(aes(color = factor(present)))

Logistic Regression

Setup Data

# setup model data: predictors + response only, complete cases only
d <- pts_env %>% 
  select(-ID) %>%  # remove terms we don't want to model
  tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 19666

11. Linear Model

# fit a linear model (ordinary least squares on the 0/1 response)
mdl <- lm(present ~ ., data = d)
summary(mdl)
## 
## Call:
## lm(formula = present ~ ., data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3436 -0.2330 -0.0042  0.2660  1.4528 
## 
## Coefficients:
##                 Estimate   Std. Error t value             Pr(>|t|)    
## (Intercept)  2.337427114  0.047220821  49.500 < 0.0000000000000002 ***
## WC_bio1     -0.051314935  0.000998385 -51.398 < 0.0000000000000002 ***
## WC_bio2     -0.079296938  0.001190722 -66.596 < 0.0000000000000002 ***
## WC_bio4      0.000855992  0.000239617   3.572             0.000355 ***
## WC_bio12    -0.000248862  0.000004093 -60.800 < 0.0000000000000002 ***
## ER_tri      -0.004050119  0.000206591 -19.605 < 0.0000000000000002 ***
## ER_topoWet   0.033489066  0.003434389   9.751 < 0.0000000000000002 ***
## lon         -0.003750310  0.000088678 -42.291 < 0.0000000000000002 ***
## lat         -0.004713234  0.000241597 -19.509 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.348 on 19657 degrees of freedom
## Multiple R-squared:  0.5156, Adjusted R-squared:  0.5154 
## F-statistic:  2615 on 8 and 19657 DF,  p-value: < 0.00000000000000022
# NOTE(review): predicting on pts_env (which, unlike d, still contains NA
# rows) yields NA predictions — hence the NA range below. Also,
# type="response" has no effect for lm objects (it is a glm argument).
y_predict <- predict(mdl, pts_env, type="response")
y_true    <- pts_env$present

range(y_predict)
## [1] NA NA
range(y_true)
## [1] 0 1

12. Generalized Linear Model

# fit a generalized linear model with a binomial logit link function,
# so predicted probabilities are bounded to [0, 1] (unlike the lm above)
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7889  -0.4619  -0.0356   0.5161   4.1303  
## 
## Coefficients:
##                Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept) 16.56154144  0.49657377  33.352 < 0.0000000000000002 ***
## WC_bio1     -0.44578866  0.01157672 -38.507 < 0.0000000000000002 ***
## WC_bio2     -0.63734106  0.01291311 -49.356 < 0.0000000000000002 ***
## WC_bio4     -0.00543979  0.00231069  -2.354               0.0186 *  
## WC_bio12    -0.00196284  0.00004287 -45.783 < 0.0000000000000002 ***
## ER_tri      -0.05231856  0.00260975 -20.047 < 0.0000000000000002 ***
## ER_topoWet   0.23003147  0.03362956   6.840     0.00000000000791 ***
## lon         -0.02801416  0.00086082 -32.544 < 0.0000000000000002 ***
## lat         -0.04085363  0.00219643 -18.600 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27259  on 19665  degrees of freedom
## Residual deviance: 13558  on 19657  degrees of freedom
## AIC: 13576
## 
## Number of Fisher Scoring iterations: 6
# predict on d (the NA-free data) on the response (probability) scale
y_predict <- predict(mdl, d, type="response")

range(y_predict)
## [1] 0.0000001391132 0.9996481631125

13. GLM Term Plots

# show term plots: partial residuals with standard-error bands per predictor
termplot(mdl, partial.resid = TRUE, se = TRUE, main = FALSE)

14 and 15. Generalized Additive Model and Term Plots

# fit a generalized additive model with smooth predictors
# (each s() term fits a spline smooth, allowing non-linear responses)
mdl <- mgcv::gam(
  formula = present ~ s(WC_bio1) + s(WC_bio2) + s(WC_bio4) + s(WC_bio12) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat), 
  family = binomial, data = d)
summary(mdl)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_bio1) + s(WC_bio2) + s(WC_bio4) + s(WC_bio12) + 
##     s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value            Pr(>|z|)    
## (Intercept)   -76.20       8.36  -9.114 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df Chi.sq              p-value    
## s(WC_bio1)    7.539  7.962 217.34 < 0.0000000000000002 ***
## s(WC_bio2)    8.821  8.981 439.53 < 0.0000000000000002 ***
## s(WC_bio4)    8.988  9.000 265.35 < 0.0000000000000002 ***
## s(WC_bio12)   8.028  8.113 722.01 < 0.0000000000000002 ***
## s(ER_tri)     6.818  7.530  34.26            0.0000184 ***
## s(ER_topoWet) 7.815  8.528  54.90 < 0.0000000000000002 ***
## s(lon)        8.973  8.999 373.82 < 0.0000000000000002 ***
## s(lat)        8.815  8.981 583.92 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.874   Deviance explained = 83.6%
## UBRE = -0.76618  Scale est. = 1         n = 19666
# show term plots (scale=0 lets each smooth use its own y-axis range)
plot(mdl, scale=0)

16. Question

Which GAM environmental variables, and even range of values, seem to contribute most towards presence (above 0 response) versus absence (below 0 response)?

WC_bio2 (mean daily temperature range) and lat (latitude) are the two GAM environmental variables that seem to contribute most towards presence. They both have periods where they are below zero, but when comparing them to the other variables they contribute more towards presence.

Maximum Entropy

Maxent is probably the most commonly used species distribution model since it performs well with few input data points, only requires presence points and is easy to use with a Java graphical user interface (GUI).

17. Maxent Models

# show version of maxent (confirms the Java MaxEnt backend is available)
maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)

18 and 19. Maxent Variable Contribution Plot and Term Plots

# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>% 
  sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class

# fit a maximum entropy model: presence points plus background sampled
# from the raster stack
mdl <- maxent(env_stack, obs_sp)
## This is MaxEnt version 3.4.3
# plot variable contributions
plot(mdl)

# plot term plots
response(mdl)

20. Question

Which Maxent environmental variables, and even range of values, seem to contribute most towards presence (closer to 1 response) and how might this differ from the GAM results?

For the Maxent variables, WC_bio1 (annual mean temperature) and WC_bio2 (mean daily temperature range) seem to contribute most to presence. This is the same in the case of WC_bio2 but WC_bio1 differs from the GAM where it contributes more to absences.

# predict habitat suitability across the raster stack
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')

plot(y_predict, main='Maxent, raw prediction')
# overlay country borders for context
# NOTE(review): maptools has been retired from CRAN — confirm availability
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

Species: Trees

# read data
pts_env <- read_csv(pts_env_csv)

# prepare the modeling frame: categorical response, predictors only, no NAs
d <- pts_env %>% 
  mutate(
    present = factor(present)) %>%  # categorical response
  select(-ID) %>%                   # not used as a predictor x
  na.omit()                         # drop rows with NA
skim(d)
Data summary
Name d
Number of rows 19666
Number of columns 9
_______________________
Column type frequency:
factor 1
numeric 8
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
present 0 1 FALSE 2 0: 9975, 1: 9691

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
WC_bio1 0 1 22.62 4.35 4.90 18.40 24.20 26.30 30.60 ▁▁▇▆▇
WC_bio2 0 1 11.53 3.17 4.20 9.30 11.20 14.10 19.70 ▂▇▇▆▁
WC_bio4 0 1 26.18 19.47 1.44 8.82 20.96 37.50 87.85 ▇▆▂▂▁
WC_bio12 0 1 961.85 722.72 4.00 480.00 719.00 1383.00 7428.00 ▇▂▁▁▁
ER_tri 0 1 14.50 21.63 0.00 2.88 7.86 16.07 269.51 ▇▁▁▁▁
ER_topoWet 0 1 12.21 1.37 6.88 11.36 12.34 13.33 15.72 ▁▂▆▇▁
lon 0 1 -26.43 48.88 -117.14 -73.55 -8.46 19.46 40.04 ▃▅▁▂▇
lat 0 1 0.42 21.27 -34.71 -21.12 4.04 20.29 39.21 ▆▃▅▇▃

Split data into training and testing

21. Tabular counts of 1 vs 0 before and after split

# create training set with 80% of full data, stratified on present so the
# 0/1 proportions are preserved in both partitions
d_split  <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train  <- rsample::training(d_split)

# show number of rows present is 0 vs 1
table(d$present)
## 
##    0    1 
## 9975 9691
table(d_train$present)
## 
##    0    1 
## 7980 7752

Decision Trees

Partition, depth = 1

22. Rpart model output and plot, depth=1

# run decision stump model (maxdepth = 1 forces a single split; cp = 0
# disables complexity pruning; minbucket = 5 sets the minimum leaf size)
mdl <- rpart(
  present ~ ., data = d_train, 
  control = list(
    cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 15732 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 15732 7752 0 (0.50724638 0.49275362)  
##   2) lat>=-25.49979 12253 4597 0 (0.62482657 0.37517343) *
##   3) lat< -25.49979 3479  324 1 (0.09313021 0.90686979) *
# plot tree 
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)

Partition, depth=default

23. Rpart model output and plot, depth=default

# decision tree with default control parameters
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 15732 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 15732 7752 0 (0.50724638 0.49275362)  
##    2) lat>=-25.49979 12253 4597 0 (0.62482657 0.37517343)  
##      4) lon>=-68.16808 6663  610 0 (0.90844965 0.09155035)  
##        8) WC_bio1>=17.55 6404  434 0 (0.93222986 0.06777014)  
##         16) lon>=-61.40179 5776  228 0 (0.96052632 0.03947368) *
##         17) lon< -61.40179 628  206 0 (0.67197452 0.32802548)  
##           34) lat< 10.12695 411    1 0 (0.99756691 0.00243309) *
##           35) lat>=10.12695 217   12 1 (0.05529954 0.94470046) *
##        9) WC_bio1< 17.55 259   83 1 (0.32046332 0.67953668) *
##      5) lon< -68.16808 5590 1603 1 (0.28676208 0.71323792)  
##       10) WC_bio12>=1504 673   28 0 (0.95839525 0.04160475) *
##       11) WC_bio12< 1504 4917  958 1 (0.19483425 0.80516575)  
##         22) lon< -91.58806 649   46 0 (0.92912173 0.07087827) *
##         23) lon>=-91.58806 4268  355 1 (0.08317713 0.91682287)  
##           46) WC_bio1< 19.75 151    8 0 (0.94701987 0.05298013) *
##           47) WC_bio1>=19.75 4117  212 1 (0.05149381 0.94850619) *
##    3) lat< -25.49979 3479  324 1 (0.09313021 0.90686979) *
# visualize the fitted tree
rpart.plot(mdl)

24. Rpart complexity parameter plot

# plot complexity parameter (cross-validated error across tree sizes)
plotcp(mdl)

# rpart cross validation results (xerror = cross-validated relative error)
mdl$cptable
##           CP nsplit rel error    xerror        xstd
## 1 0.36519608      0 1.0000000 1.0000000 0.008089145
## 2 0.30753354      1 0.6348039 0.6351909 0.007502845
## 3 0.07959236      2 0.3272704 0.3276574 0.005953419
## 4 0.07185243      3 0.2476780 0.2521930 0.005337587
## 5 0.01741486      4 0.1758256 0.1769866 0.004565082
## 6 0.01229790      5 0.1584107 0.1597007 0.004356610
## 7 0.01000000      8 0.1215170 0.1231940 0.003863573

25. Question:

Based on the complexity plot threshold, what size of tree is recommended? Based on the threshold, the tree is recommended to have a size of 10.

Feature interpretation

26. Rpart variable importance plot

# caret cross validation: tune cp over 20 candidate values with 10-fold CV
mdl_caret <- train(
  present ~ .,
  data       = d_train,
  method     = "rpart",
  trControl  = trainControl(method = "cv", number = 10),
  tuneLength = 20)

# accuracy across the tuning grid
ggplot(mdl_caret)

# variable importance plot
vip(mdl_caret, num_features = 40, bar = FALSE)

27. Question

What are the top 3 most important variables of your model?

Based on the variable importance plot, the top three most important variables are latitude, WC_bio2 (daily temperature range), and longitude.

# Commented out because these were causing the code to crash
# (presumably partial() is too slow/memory-hungry here — TODO confirm).
# Construct partial dependence plots:
# p1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
# p2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
# p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>% 
#   plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE, 
#               colorkey = TRUE, screen = list(z = -20, x = -60))
# 
# # Display plots side by side
# gridExtra::grid.arrange(p1, p2, p3, ncol = 3)

Random Forests

Fit

# number of features (all columns except the response)
n_features <- length(setdiff(names(d_train), "present"))

# fit a default random forest model (classification, since present is a factor)
mdl_rf <- ranger(present ~ ., data = d_train)

# NOTE(review): for a classification forest, prediction.error is the
# out-of-bag (OOB) misclassification rate, so its square root is not an
# RMSE in the regression sense — kept as-is to preserve the recorded output
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.1149846

Feature Interpretation

28. RandomForest variable importance

# re-fit the forest twice, once per variable-importance mode

# impurity-based variable importance
mdl_impurity <- ranger(
  present ~ ., data = d_train,
  importance = "impurity")

# permutation-based variable importance
mdl_permutation <- ranger(
  present ~ ., data = d_train,
  importance = "permutation")

# build one importance plot per mode and show them side by side
vip_impurity    <- vip::vip(mdl_impurity, bar = FALSE)
vip_permutation <- vip::vip(mdl_permutation, bar = FALSE)

gridExtra::grid.arrange(vip_impurity, vip_permutation, nrow = 1)

29. Question:

How might variable importance differ between rpart and RandomForest in your model outputs? Variable importance for rpart and impurity based RandomForest modeling remained consistent, but for permutation based random forest, WC_bio2 became more important than WC_bio12.

Species: Evaluate

1.1 Split observations into training and testing

# create training set with 80% of full data, stratified on present
pts_split  <- rsample::initial_split(
  pts, prop = 0.8, strata = "present")
pts_train  <- rsample::training(pts_split)
pts_test   <- rsample::testing(pts_split)

# presence-only training points as sp objects (for dismo)
pts_train_p <- pts_train %>% 
  filter(present == 1) %>% 
  as_Spatial()

# absence-only training points
pts_train_a <- pts_train %>% 
  filter(present == 0) %>% 
  as_Spatial()

2 Calibrate: Model Selection

30 and 31. Plot of pairs from environmental stack and VIF per variable

# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)

# each point in the scatter plots on the lower triangle is a pixel, placed
# on the x and y axes by the two variables being compared;
# the correlation is how tight the correspondence of the scatter plot is —
# the numbers are smaller where there is less correlation

# the highest VIF indicates the strongest multicollinearity

# calculate variance inflation factor per predictor, a metric of multicollinearity between variables
vif(env_stack)
##    Variables      VIF
## 1    WC_bio1 1.494759
## 2    WC_bio2 2.330385
## 3    WC_bio4 2.499115
## 4   WC_bio12 2.314188
## 5     ER_tri 3.234874
## 6 ER_topoWet 3.748889
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th=0.7) 
v
## 1 variables from the 6 input variables have collinearity problem: 
##  
## ER_topoWet 
## 
## After excluding the collinear variables, the linear correlation coefficients ranges between: 
## min correlation ( ER_tri ~ WC_bio12 ):  0.07962199 
## max correlation ( WC_bio12 ~ WC_bio2 ):  -0.6979262 
## 
## ---------- VIFs of the remained variables -------- 
##   Variables      VIF
## 1   WC_bio1 1.388594
## 2   WC_bio2 2.329276
## 3   WC_bio4 2.437081
## 4  WC_bio12 2.334120
## 5    ER_tri 1.290442
# v gives you a reduced set of predictors (only ER_topoWet was flagged)

32 and 33. Variables after VIF collinearity removal and Plot of pairs after VIF collinearity removal

# reduce environmental raster stack by dropping the layers flagged in v
env_stack_v <- usdm::exclude(env_stack, v)

# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)

# using the v above, none of the remaining pairs should have a correlation
# coefficient greater than 0.7 because of the threshold.

34. Plot of variable contribution

# path for caching the fitted model (was previously used but never defined)
mdl_maxv_rds <- file.path(dir_data, "mdl_maxv.rds")

# fit a maximum entropy model on the collinearity-reduced raster stack
# NOTE(review): pts_train contains both presence AND pseudo-absence points,
# but maxent's second argument expects presence points — confirm whether
# pts_train_p was intended here
mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
## This is MaxEnt version 3.4.3
# cache the fitted model, then reload it from disk
readr::write_rds(mdl_maxv, mdl_maxv_rds)

mdl_maxv <- read_rds(mdl_maxv_rds)

# plot variable contributions per predictor
plot(mdl_maxv)

# plot term plots
response(mdl_maxv)

35. Question:

Which variables were removed due to multicollinearity and what is the rank of most to least important remaining variables in your model? Only ER_topoWet was removed due to multicollinearity (per the vifcor output, it was the one flagged variable). The importance ranking is WC_bio1, WC_bio2, WC_bio12 and ER_tri, with bio1 being the most important.

36 and 37. Maxent term plots and Map of Maxent prediction

# predict suitability across the raster stack
# NOTE(review): predicts with the full env_stack rather than the reduced
# env_stack_v used to fit mdl_maxv — confirm this is intended
y_maxv <- predict(env_stack, mdl_maxv) #, ext=ext, progress='')

plot(y_maxv, main='Maxent, raw prediction')
# overlay country borders for context
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

Evaluate: Model Performance

3.1 Area Under the Curve (AUC), Receiver Operating Characteristic (ROC) Curve and Confusion Matrix

38 - 41. ROC threshold value maximizing specificity and sensitivity, Confusion matrix with percentages, AUC plot and Map of binary habitat

# presence and absence TEST points as sp objects for model evaluation
pts_test_p <- pts_test %>% 
  filter(present == 1) %>% 
  as_Spatial()
pts_test_a <- pts_test %>% 
  filter(present == 0) %>% 
  as_Spatial()

# predicted suitability surface from the fitted maxent model
y_maxv <- predict(mdl_maxv, env_stack)
#plot(y_maxv)

# this is a prediction and you're comparing the prediction to our known presence and absence points. 
e <- dismo::evaluate(
  p     = pts_test_p,
  a     = pts_test_a, 
  model = mdl_maxv,
  x     = env_stack)
e
## class          : ModelEvaluation 
## n presences    : 1933 
## n absences     : 1996 
## AUC            : 0.8805386 
## cor            : 0.6780314 
## max TPR+TNR at : 0.6543424
# receiver operating characteristic curve
plot(e, 'ROC')

# threshold that maximizes the sum of specificity and sensitivity
thr <- threshold(e)[['spec_sens']]
thr
## [1] 0.6543424
# of the presences observed, how many true predictions are we getting
# (predicted value >= threshold counts as predicted present)
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
# of the absences observed, how many true predictions are we getting 
# anything less than the threshold will be a 0
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)


# TPR & TNR are the true positive and true negative rates 
# (t)rue/(f)alse (p)ositive/(n)egative rates
# p_true is a vector of TRUEs and FALSEs, so sum/length gives a rate
tpr <- sum(p_true)/length(p_true)

# how many we observed were present but we predicted absent
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)
# the rates above populate the 2x2 confusion matrix below 
matrix(
  c(tpr, fnr,
    fpr, tnr), 
  nrow=2, dimnames = list(
    c("present_obs", "absent_obs"),
    c("present_pred", "absent_pred")))
##             present_pred absent_pred
## present_obs    0.7982411   0.1172345
## absent_obs     0.2017589   0.8827655
# add point to ROC plot at the chosen threshold
points(fpr, tpr, pch=23, bg="blue")

# each point along the ROC curve corresponds to a different threshold value,
# which is distinct from the axis values themselves


# y_maxv is the maxent prediction over the environmental stack;
# applying the threshold gives the binary habitat / no-habitat map
plot(y_maxv > thr)